CS 556 Final Group Project: Near-Earth Objects¶

Jay Talekar, Jaydeep Maganbhai Dobariya, Syed Z. Raza¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Exploratory Data Analysis¶

Example Rows from the Dataset¶

In [6]:
df = pd.read_csv("./NASA.csv")
df.head()
Out[6]:
id name est_diameter_min est_diameter_max relative_velocity miss_distance orbiting_body sentry_object absolute_magnitude hazardous
0 2162635 162635 (2000 SS164) 1.198271 2.679415 13569.249224 5.483974e+07 Earth False 16.73 False
1 2277475 277475 (2005 WK4) 0.265800 0.594347 73588.726663 6.143813e+07 Earth False 20.00 True
2 2512244 512244 (2015 YE18) 0.722030 1.614507 114258.692129 4.979872e+07 Earth False 17.83 False
3 3596030 (2012 BV13) 0.096506 0.215794 24764.303138 2.543497e+07 Earth False 22.20 False
4 3667127 (2014 GE35) 0.255009 0.570217 42737.733765 4.627557e+07 Earth False 20.09 True

Missing Data?¶

Are there any missing values in our dataset? To answer this question, let's use Seaborn. In the following visualization, we use a heatmap to highlight where the NaN values are in each column, if any:

In [7]:
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
plt.title("Heatmap of NaNs in NASA Dataset")
plt.show()
In [8]:
np.sum(df.isnull())
Out[8]:
id                    0
name                  0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
orbiting_body         0
sentry_object         0
absolute_magnitude    0
hazardous             0
dtype: int64

Takeaway: woohoo! We have no NaNs.

Redundant Features?¶

One of our priorities in creating machine learning models is to end up with a low-variance, low-bias model. Put differently, we want our model to capture the full range of values that can influence the final label, without paying attention to features that don't add useful information for our classification.

OK, having said this - do we have any columns in our dataset that contain only 1 unique value?

In [9]:
sentry_dist = df['sentry_object'].value_counts()
sentry_dist.values
Out[9]:
array([90836])

(The single-element array above suggests this column only ever takes one value - let's visualize to confirm):

In [10]:
plt.pie(sentry_dist.values, labels=sentry_dist.keys())
plt.title('Relative Frequency of Sentry Objects')
plt.legend()
plt.show()

Let's also do this for the other categorical predictor variable - i.e., orbiting_body. Just for fun, I'll use another kind of visualization:

In [11]:
orbit_dist = df['orbiting_body'].value_counts()

plt.bar(orbit_dist.index, orbit_dist.values)
plt.title('Relative Frequency of Orbiting Bodies')
plt.show()

Takeaway: well then! It looks like both the orbiting_body and sentry_object columns are constant! So we can go ahead and drop them from the dataset, once we start modeling.
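For what it's worth, here's a quicker programmatic check (not in the original notebook) - any column whose unique-value count equals 1 is constant:

df.nunique()  # any column showing 1 here carries no information for classification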

On a related note - the id column probably will not offer any relevant information for making classifications either. So we'll drop that as well:

In [12]:
df_dropped = df.drop(labels=['id', 'orbiting_body', 'sentry_object'], axis=1)
df_dropped.head()
Out[12]:
name est_diameter_min est_diameter_max relative_velocity miss_distance absolute_magnitude hazardous
0 162635 (2000 SS164) 1.198271 2.679415 13569.249224 5.483974e+07 16.73 False
1 277475 (2005 WK4) 0.265800 0.594347 73588.726663 6.143813e+07 20.00 True
2 512244 (2015 YE18) 0.722030 1.614507 114258.692129 4.979872e+07 17.83 False
3 (2012 BV13) 0.096506 0.215794 24764.303138 2.543497e+07 22.20 False
4 (2014 GE35) 0.255009 0.570217 42737.733765 4.627557e+07 20.09 True

Hazardous NEOs Over Time?¶

We observe that the name column provides the year associated with each near-Earth object (NEO). Wouldn't it be interesting to see whether the number of hazardous NEOs is trending up or down over time?

To do this, let's begin by engineering a new column that contains just the year of each record in the dataset:

In [13]:
def extract_year(name: str) -> int:
    start_index = name.find("(") + 1
    return int(name[start_index:start_index + 4])


def locate_names_with_no_year(names) -> tuple:
    """Return the extracted years, plus (index, name) pairs where no year could be parsed."""
    years, nonyears = [], []
    for index, name in enumerate(names):
        try:
            years.append(extract_year(name))
        except ValueError:
            nonyears.append((index, name))
    return years, nonyears
In [14]:
years, year_not_found = locate_names_with_no_year(df_dropped["name"])
year_not_found
Out[14]:
[(1847, '719 Albert (A911 TB)'),
 (12709, '433 Eros (A898 PA)'),
 (36418, '1036 Ganymed (A924 UB)'),
 (37651, '433 Eros (A898 PA)'),
 (56533, '433 Eros (A898 PA)'),
 (73482, '(A/2019 Q2)')]
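(The cells that assemble years_and_hazards - In [15] through In [22] - are not shown in this export. A plausible reconstruction, assuming we drop the six unparsable rows above and pair each extracted year with its hazardous flag; the column names here are our guess, chosen to match the groupby on "Year" below:)

# hypothetical reconstruction of the missing cells
bad_indices = [index for index, _ in year_not_found]
years_and_hazards = pd.DataFrame({
    "Year": years,
    "Hazardous": df_dropped.drop(index=bad_indices)["hazardous"].astype(int).values,
})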
In [23]:
aggregate_hazard_yrs = years_and_hazards.groupby("Year", sort=True).sum()
In [24]:
plt.plot(aggregate_hazard_yrs.index,
         aggregate_hazard_yrs.values)
plt.title("Number of Hazardous NEOs Annually")
plt.ylabel('# Hazardous NEOs')
plt.xlabel('Year')
plt.show()

Takeaway: interesting! Without knowing too much about the domain, our team is not sure what to make of this trend. We observe that the number of hazardous NEOs seems to spike around the turn of the century, but the data alone does not explain why. To be on the conservative side, we presume the spike is merely due to extraneous factors (e.g., improved detection technology at NASA). It does not seem likely that the probability of a NEO being hazardous would suddenly spike because of the year in which it was observed.

Going forward, we will focus on building a classifier that predicts based solely on the physical properties of a given NEO, and leave the year out of the dataset.

In [25]:
df_dropped2 = df_dropped.drop(labels=["name"], axis=1)
df_dropped2.head()
Out[25]:
est_diameter_min est_diameter_max relative_velocity miss_distance absolute_magnitude hazardous
0 1.198271 2.679415 13569.249224 5.483974e+07 16.73 False
1 0.265800 0.594347 73588.726663 6.143813e+07 20.00 True
2 0.722030 1.614507 114258.692129 4.979872e+07 17.83 False
3 0.096506 0.215794 24764.303138 2.543497e+07 22.20 False
4 0.255009 0.570217 42737.733765 4.627557e+07 20.09 True
In [26]:
df_dropped2.describe()
Out[26]:
est_diameter_min est_diameter_max relative_velocity miss_distance absolute_magnitude
count 90836.000000 90836.000000 90836.000000 9.083600e+04 90836.000000
mean 0.127432 0.284947 48066.918918 3.706655e+07 23.527103
std 0.298511 0.667491 25293.296961 2.235204e+07 2.894086
min 0.000609 0.001362 203.346433 6.745533e+03 9.230000
25% 0.019256 0.043057 28619.020645 1.721082e+07 21.340000
50% 0.048368 0.108153 44190.117890 3.784658e+07 23.700000
75% 0.143402 0.320656 62923.604633 5.654900e+07 25.700000
max 37.892650 84.730541 236990.128088 7.479865e+07 33.200000

Feature & Model Selection¶

The goals of this section are:

  1. to determine which features we actually want to use for the model,
  2. and, on the flip side, which classifier type to use on the chosen features - logistic regression, or a support vector machine (potentially making use of the kernel trick)?

Data Preprocessing and Data Splitting¶

This will let us avoid "data snooping bias" for the rest of this project:
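(The preprocessing cells themselves - In [27] through In [31] - did not survive in this export. A plausible reconstruction, assuming an 80/20 split of df_dropped2 with the random_state of 42 used throughout this notebook; the double brackets keep y_train two-dimensional, which matches the np.squeeze(y_train) calls later on:)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_dropped2.drop("hazardous", axis=1)
y = df_dropped2[["hazardous"]]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)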

Checking Correlations¶

In [32]:
sns.heatmap(X_train.corr(), annot=True)
plt.title("Correlation Matrix of NASA Dataset")
plt.show()

Takeaway: apart from est_diameter_min and est_diameter_max, there does not seem to be any strong multicollinearity between features in this dataset.

So it would be prudent to drop either the est_diameter_min or est_diameter_max column, to help prevent overfitting.

This is just a hunch, but let's drop est_diameter_max going forward - we presumably want our model to be very sensitive to the size of an NEO (after all, we don't want to miss any incoming hazards!), and keeping the minimum diameter estimate should help the model pick up on even small potential hazards.

Before going into next steps, let's standardize our predictor variables as well:

In [33]:
X_train = X_train.drop("est_diameter_max", axis=1)
X_test = X_test.drop("est_diameter_max", axis=1)
In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Modeling (4 Features)¶

1: SVM (Lagrange Multipliers)¶

Searching for the Best Kernel and Class Weight¶

Because we know this dataset is imbalanced, we will grid search for the right class weight. We also don't know whether the classes are linearly separable, so for this first model let's also search for the best kernel to use:

In [35]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
In [36]:
param_grid1 = {
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "class_weight": [None, "balanced"]  # this is less important to search for right now
} 
In [37]:
# grid1 = GridSearchCV(SVC(),
#                      param_grid1,
#                      refit=True,
#                      cv=2,  # only 2 folds, to save time
#                      verbose=2)
# grid1.fit(X_train_scaled, np.squeeze(y_train))

^This takes too long to fit - Colab crashes before it can complete. Instead, we'll try a more efficient approach via gradient descent:

2: Linear SVM (Gradient Descent)¶

In [38]:
param_grid2 = {
    "class_weight": [None, "balanced"],
    "penalty": ["l2"],  # let's avoid l1, b/c we've already selected features 
    "random_state": [42],  # for reproducibility purposes
    "average": [True, False],
    "max_iter": [5000],
    "alpha": [.0001, .01, 10], # regularization param
    "learning_rate": ["optimal", "adaptive"],
    "eta0": [0.001],  # we can tune this further, but for now I
                      # just want to see if using an 
                      # adaptive schedule vs. an optimal one has any value
                      # in the first place
    # these next two will help us not waste compute time
    "early_stopping":  [True],
    "validation_fraction": [0.1]
} 

Note: SGDClassifier with loss="hinge" fits a linear SVM, so this search only covers a linear kernel for now.

In [39]:
grid2 = GridSearchCV(SGDClassifier(loss="hinge"),
                     param_grid2,
                     refit=True,
                     cv=5,  # 5-fold cross-validation
                     verbose=2)
grid2.fit(X_train_scaled, np.squeeze(y_train))
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END alpha=0.0001, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[... 118 similar fit logs elided ...]
[CV] END alpha=10, average=False, class_weight=balanced, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
Out[39]:
GridSearchCV(cv=5, estimator=SGDClassifier(),
             param_grid={'alpha': [0.0001, 0.01, 10], 'average': [True, False],
                         'class_weight': [None, 'balanced'],
                         'early_stopping': [True], 'eta0': [0.001],
                         'learning_rate': ['optimal', 'adaptive'],
                         'max_iter': [5000], 'penalty': ['l2'],
                         'random_state': [42], 'validation_fraction': [0.1]},
             verbose=2)

OK nice! We've tried a bunch of combinations - let's see what the best parameters turned out to be:

In [40]:
grid2.best_params_
Out[40]:
{'alpha': 0.0001,
 'average': True,
 'class_weight': None,
 'early_stopping': True,
 'eta0': 0.001,
 'learning_rate': 'optimal',
 'max_iter': 5000,
 'penalty': 'l2',
 'random_state': 42,
 'validation_fraction': 0.1}

Now, let's evaluate the best model so far:

In [41]:
from sklearn.metrics import (
    classification_report,
    ConfusionMatrixDisplay
)
In [42]:
ConfusionMatrixDisplay.from_estimator(grid2.best_estimator_,
                                      X_test_scaled, y_test)
plt.title("Linear SVM (SGD) Confusion Matrix")
plt.show()
In [43]:
print(
    classification_report(y_test,
                          grid2.best_estimator_.predict(X_test_scaled))
)
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     16439
           1       0.00      0.00      0.00      1729

    accuracy                           0.90     18168
   macro avg       0.45      0.50      0.48     18168
weighted avg       0.82      0.90      0.86     18168

/usr/local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
(warning repeated 3 times)

Good news: the test accuracy is high!

Bad news: the model has totally given up on predicting the hazardous NEOs.

What can we do to remedy this, so that we improve the number of true positives predicted by our model?

3: Linear SVM (Balanced Class Weights, Gradient Descent)¶

Let's create a model that's nearly the same as before, with just one small change - this time, we will weight the positive class (i.e., the minority class) more heavily in our optimization:

In [46]:
params_to_use = grid2.best_params_.copy()
params_to_use["class_weight"] = "balanced"
model3 = SGDClassifier(**params_to_use)
model3 = model3.fit(X_train_scaled, np.squeeze(y_train))
In [47]:
ConfusionMatrixDisplay.from_estimator(model3,
                                      X_test_scaled,
                                      y_test)
plt.title("Linear SVM (SGD) Confusion Matrix, Balanced")
plt.show()
In [48]:
print(
    classification_report(y_test,
                          model3.predict(X_test_scaled))
)
              precision    recall  f1-score   support

           0       1.00      0.74      0.85     16439
           1       0.29      0.97      0.44      1729

    accuracy                           0.77     18168
   macro avg       0.64      0.86      0.65     18168
weighted avg       0.93      0.77      0.81     18168

Takeaways:

  1. Good news: the recall for our positive class has indeed improved!

  2. Bad news: our test accuracy has taken a hit :(

  3. Even though this would be a much safer model to use in the real world, perhaps there is some way we can achieve high accuracy on both classes?

4: Kernel SVM (RBF, Balanced Class Weights, Gradient Descent)¶

The intuition here is that our classes may not be linearly separable. So perhaps the key to building a model that is both highly accurate AND highly precise is a non-linear kernel. Let's try that now:

In [49]:
from sklearn.kernel_approximation import Nystroem
In [50]:
# these args are totally arbitrary, just copying the docs for now:
# https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem

feature_map_nystroem1 = Nystroem(gamma=.2,
                                 random_state=1,
                                 n_components=300)
In [51]:
best_params4 = {  # same as grid2.best_params_, but with class_weight switched to "balanced"
    'alpha': 0.0001,
    'average': True,
    'class_weight': "balanced",
    'early_stopping': True,
    'eta0': 0.001,
    'learning_rate': 'optimal',
    'max_iter': 5000,
    'penalty': 'l2',
    'random_state': 42,
    'validation_fraction': 0.1
}
In [52]:
X_train_rbf = feature_map_nystroem1.fit_transform(X_train_scaled)
X_test_rbf = feature_map_nystroem1.transform(X_test_scaled)
In [53]:
model4 = SGDClassifier(**best_params4)
model4 = model4.fit(X_train_rbf, np.squeeze(y_train))
In [54]:
ConfusionMatrixDisplay.from_estimator(model4,
                                      X_test_rbf,
                                      y_test)
plt.title("Kernel SVM (RBF, SGD) Confusion Matrix, Balanced")
plt.show()
In [55]:
print(
    classification_report(y_test,
                          model4.predict(X_test_rbf))
)
              precision    recall  f1-score   support

           0       1.00      0.74      0.85     16439
           1       0.29      0.99      0.45      1729

    accuracy                           0.77     18168
   macro avg       0.64      0.87      0.65     18168
weighted avg       0.93      0.77      0.81     18168

Takeaways:

  1. Pros: this model has a better recall on the positive class than our 3rd model

  2. Cons: our overall accuracy has still not really improved by that much.

Let's try one more time, just to see if tuning the feature map can help:

5: One Class SVM¶

Like before:

  • we'll use gradient descent to optimize the model, due to the size of our dataset
  • we'll use an RBF kernel, due to the nature of our data probably being nonlinear (as we saw earlier, using the RBF kernel improved our recall)
  • the difference is that now we'll train the model to recognize only the non-hazardous examples - that way, anything hazardous can immediately be flagged as out of the ordinary (said differently, as an "outlier")
In [61]:
from sklearn.linear_model import SGDOneClassSVM
In [62]:
# start by rebuilding the working dataset, minus the redundant diameter column
from sklearn.preprocessing import LabelEncoder

data = df_dropped2.drop("est_diameter_max", axis=1)
# encode the boolean class column as 0/1, since we forgot to do so before
data["hazardous"] = LabelEncoder().fit_transform(data["hazardous"])
train, test = train_test_split(data, test_size=.2, random_state=42)
In [63]:
train.head()  # sanity check
Out[63]:
est_diameter_min relative_velocity miss_distance absolute_magnitude hazardous
35538 0.038420 91103.489666 6.350550e+07 24.2 0
40393 0.192555 28359.611312 2.868167e+07 20.7 0
58540 0.004619 107351.426865 5.388098e+04 28.8 0
61670 0.015295 21423.536884 5.103884e+07 26.2 0
11435 0.011603 69856.053840 7.360836e+07 26.8 0

We'll also have to measure the proportion of positive examples in our dataset again - this will serve as the nu parameter (the expected outlier fraction) for the one-class model:

In [64]:
num_positive = train[train["hazardous"] == 1].shape[0]
outlier_proportion = num_positive / float(train.shape[0])

Now we're almost ready to train the one-class model. But we want to be able to use the kernel trick, so let's go ahead and transform this data first.

We'll build a few different feature maps this time, which will let us tune the gamma parameter of our RBF kernel:

In [65]:
gammas = [0.00001, 0.001, 0.01, 0.05, 0.1, 1, 5, 10, 100]
gamma_kernel = dict()

for g in gammas:
    feature_map = Nystroem(gamma=g,
                           random_state=1,
                           n_components=300)
    # splitting classes in the training data + preprocessing
    train_normal, train_outlier = (
        train[train["hazardous"] == 0],
        train[train["hazardous"] == 1],
    )
    X_train, y_train = (
        train_normal.drop("hazardous", axis=1),
        train_normal["hazardous"],
    )
    X_test, y_test = (  # note: we include both classes in the test data
        test.drop("hazardous", axis=1),
        test["hazardous"],
    )
    scaler = StandardScaler()
    X_train_scaled_normal = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # apply the kernel
    X_train_rbf = feature_map.fit_transform(X_train_scaled_normal)
    X_test_rbf = feature_map.transform(X_test_scaled)
    gamma_kernel[g] = (X_train_rbf, X_test_rbf, y_train, y_test)
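One note before training: the cell that defined params_to_use5 (presumably In [66]) is missing from this export. A plausible reconstruction, keeping only those earlier SGD settings that SGDOneClassSVM actually accepts (it has no class_weight, alpha, or penalty parameters):

# hypothetical reconstruction of the missing cell
params_to_use5 = {
    "average": True,
    "learning_rate": "optimal",
    "eta0": 0.001,
    "max_iter": 5000,
    "random_state": 42,
}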
    

Let's get training:

In [67]:
models = dict()

for g, X in gamma_kernel.items():
    X_train_rbf, _, _, _ = X
    svm = SGDOneClassSVM(nu=outlier_proportion,
                         **params_to_use5)
    models[g] = svm.fit(X_train_rbf)
    

And, let's get testing!

In [68]:
def evaluate_one_class_svm(model, X_test, y_test, gamma=None):
    y_pred = model.predict(X_test)
    # relabel y_pred so -1 --> our positive class, and 1 --> our negatives
    y_pred_transformed = np.where(y_pred == -1, 1, 0)
    # visualize the confusion matrix
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred_transformed)
    plt.title(f"One-Class SVM (RBF) Confusion Matrix, g = {gamma}")
    plt.show()
    print(classification_report(y_test, y_pred_transformed))
    # also return the report as a dict, so callers can aggregate results
    return classification_report(y_test, y_pred_transformed, output_dict=True)
In [69]:
results = list()

for g, X in gamma_kernel.items():
    _, X_test_rbf, _, y_test = X
    svm = models[g]
    report = evaluate_one_class_svm(svm, X_test_rbf, y_test, gamma=g)
    results.append((g, report["macro avg"]["f1-score"]))
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     16439
           1       0.27      0.01      0.02      1729

    accuracy                           0.90     18168
   macro avg       0.59      0.50      0.49     18168
weighted avg       0.84      0.90      0.86     18168

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     16439
           1       0.15      0.15      0.15      1729

    accuracy                           0.84     18168
   macro avg       0.53      0.53      0.53     18168
weighted avg       0.84      0.84      0.84     18168

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     16439
           1       0.15      0.16      0.16      1729

    accuracy                           0.83     18168
   macro avg       0.53      0.53      0.53     18168
weighted avg       0.84      0.83      0.84     18168

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     16439
           1       0.17      0.19      0.18      1729

    accuracy                           0.84     18168
   macro avg       0.54      0.55      0.54     18168
weighted avg       0.84      0.84      0.84     18168

              precision    recall  f1-score   support

           0       0.92      0.90      0.91     16439
           1       0.20      0.22      0.21      1729

    accuracy                           0.84     18168
   macro avg       0.56      0.56      0.56     18168
weighted avg       0.85      0.84      0.84     18168

              precision    recall  f1-score   support

           0       0.92      0.92      0.92     16439
           1       0.25      0.26      0.26      1729

    accuracy                           0.85     18168
   macro avg       0.59      0.59      0.59     18168
weighted avg       0.86      0.85      0.86     18168

              precision    recall  f1-score   support

           0       0.92      0.92      0.92     16439
           1       0.23      0.23      0.23      1729

    accuracy                           0.85     18168
   macro avg       0.57      0.58      0.57     18168
weighted avg       0.85      0.85      0.85     18168

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     16439
           1       0.00      0.00      0.00      1729

    accuracy                           0.90     18168
   macro avg       0.45      0.50      0.48     18168
weighted avg       0.82      0.90      0.86     18168

/usr/local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
(warning repeated 4 times)
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     16439
           1       0.00      0.00      0.00      1729

    accuracy                           0.90     18168
   macro avg       0.45      0.50      0.48     18168
weighted avg       0.82      0.90      0.86     18168

/usr/local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
(warning repeated 2 times)

Takeaways:

  • We know there are lots of graphs above - suffice it to say, upon closer inspection the One-Class SVM with gamma equal to 1 yields the highest number of both true positives and true negatives.

  • Restated: the best model has gamma = 1, its test accuracy is 0.85, and the macro averages of its precision, recall, and f1-score are all 0.59.

  • It doesn't seem likely we can tune the model much further and get a noticeable improvement - as we can see above, models with gamma values both >1 and <1 start favoring the majority class (of negatives) over the positive class. (See the programmatic check below.)
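Given the (gamma, macro-F1) pairs collected in results above, the best gamma can also be read off programmatically:

best_gamma, best_macro_f1 = max(results, key=lambda r: r[1])
print(f"best gamma: {best_gamma}, macro-avg F1: {best_macro_f1:.2f}")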

Dimensionality Reduction using PCA¶

Visualizing, Standardizing and Splitting¶

Visualizing each feature wrt other features using Scatterplot¶

In [73]:
g = sns.PairGrid(data, hue="hazardous")
g.map_offdiag(sns.scatterplot)
g.map_diag(sns.histplot, multiple='stack')
g.add_legend()
Out[73]:
<seaborn.axisgrid.PairGrid at 0x142557b80>
In [74]:
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
fig.suptitle("Features vs Target Variable")
sns.boxenplot(ax=axes[0], x='hazardous', y='est_diameter_min', data=data)
sns.boxenplot(ax=axes[1], x='hazardous', y='relative_velocity', data=data)
sns.boxplot(ax=axes[2], x='hazardous', y='miss_distance', data=data)
sns.boxplot(ax=axes[3], x='hazardous', y='absolute_magnitude', data=data)
plt.show()
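(The cell defining X and y here - In [75] - is not shown in this export; presumably it just re-separates the four features from the label, along these lines:)

# hypothetical reconstruction of the missing cell
X = data.drop("hazardous", axis=1)
y = data["hazardous"]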
In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)
In [77]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Finding 2 Principal Components with PCA¶

In [78]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)
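As a quick sanity check (not in the original notebook), we can see how much of the total variance the two components actually retain:

print(pca.explained_variance_ratio_)        # per-component share of variance
print(pca.explained_variance_ratio_.sum())  # total variance kept by the 2 components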

Modeling (2 Features)¶

1: Linear SVM (Gradient Descent & w/o Balanced Class Weights)¶

In [79]:
param_grid2_reduced = {
    "class_weight": [None],
    "penalty": ["l2"],  # let's avoid l1, b/c we've already selected features 
    "random_state": [42],  # for reproducibility purposes
    "average": [True, False],
    "max_iter": [5000],
    "alpha": [.0001, .01, 10], # regularization param
    "learning_rate": ["optimal", "adaptive"],
    "eta0": [0.001],  
    "early_stopping":  [True],
    "validation_fraction": [0.1],
} 
In [80]:
grid2_reduced = GridSearchCV(SGDClassifier(loss="hinge"),
                     param_grid2_reduced,
                     refit=True,
                     cv=5,  # 5-fold cross-validation
                     verbose=2)
grid2_reduced.fit(X_train_reduced, np.squeeze(y_train))
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END alpha=0.0001, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[... remaining fit logs elided ...]
[CV] END alpha=0.01, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.2s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.2s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.2s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=True, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.2s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=optimal, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
[CV] END alpha=10, average=False, class_weight=None, early_stopping=True, eta0=0.001, learning_rate=adaptive, max_iter=5000, penalty=l2, random_state=42, validation_fraction=0.1; total time=   0.1s
Out[80]:
GridSearchCV(cv=5, estimator=SGDClassifier(),
             param_grid={'alpha': [0.0001, 0.01, 10], 'average': [True, False],
                         'class_weight': [None], 'early_stopping': [True],
                         'eta0': [0.001],
                         'learning_rate': ['optimal', 'adaptive'],
                         'max_iter': [5000], 'penalty': ['l2'],
                         'random_state': [42], 'validation_fraction': [0.1]},
             verbose=2)
In [81]:
grid2_reduced.best_params_
Out[81]:
{'alpha': 0.0001,
 'average': True,
 'class_weight': None,
 'early_stopping': True,
 'eta0': 0.001,
 'learning_rate': 'adaptive',
 'max_iter': 5000,
 'penalty': 'l2',
 'random_state': 42,
 'validation_fraction': 0.1}
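
For completeness, the fitted grid object also records the cross-validated score behind this choice. A quick way to inspect it, using only the standard GridSearchCV attributes best_score_ and cv_results_:

print(grid2_reduced.best_score_)  # mean CV accuracy of the best candidate

# rank the full set of candidates by mean test score
results = pd.DataFrame(grid2_reduced.cv_results_)
print(results.sort_values('rank_test_score')
             [['params', 'mean_test_score', 'std_test_score']].head())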
In [82]:
ConfusionMatrixDisplay.from_estimator(
    grid2_reduced.best_estimator_,
    X_test_reduced, y_test
)
plt.title("Linear SVM (SGD) Confusion Matrix (2 Features)")
plt.show()
In [83]:
print(classification_report(y_test, 
                            grid2_reduced.best_estimator_.predict(X_test_reduced)))
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     16439
           1       0.00      0.00      0.00      1729

    accuracy                           0.90     18168
   macro avg       0.45      0.50      0.48     18168
weighted avg       0.82      0.90      0.86     18168

/usr/local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
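
Aside: the warning above appears because this model never predicts class 1, making precision for that class 0/0. If we want to silence it and make the convention explicit, classification_report accepts a zero_division parameter:

print(classification_report(y_test,
                            grid2_reduced.best_estimator_.predict(X_test_reduced),
                            zero_division=0))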

2: Linear SVM (Gradient Descent & Balanced Class Weights)¶

In [84]:
params_grid3_reduced = grid2_reduced.best_params_.copy()
params_grid3_reduced['class_weight'] = 'balanced'
model3_reduced = SGDClassifier(**params_grid3_reduced)
model3_reduced.fit(X_train_reduced, y_train)
Out[84]:
SGDClassifier(average=True, class_weight='balanced', early_stopping=True,
              eta0=0.001, learning_rate='adaptive', max_iter=5000,
              random_state=42)
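
As a sanity check on what class_weight='balanced' actually does: each class c is weighted by n_samples / (n_classes * n_c), so the rare "hazardous" class gets a proportionally larger weight. We can compute these weights directly with scikit-learn's helper:

from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced',
                               classes=classes, y=y_train)
print(dict(zip(classes, weights)))  # the minority class gets the larger weight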
In [85]:
ConfusionMatrixDisplay.from_estimator(model3_reduced,
                                      X_test_reduced, y_test)
plt.title("Linear SVM (SGD) Confusion Matrix, Balanced (2 Features)")
plt.show()
In [86]:
print(classification_report(y_test,
                            model3_reduced.predict(X_test_reduced)))
              precision    recall  f1-score   support

           0       1.00      0.74      0.85     16439
           1       0.28      0.97      0.44      1729

    accuracy                           0.76     18168
   macro avg       0.64      0.86      0.65     18168
weighted avg       0.93      0.76      0.81     18168

3: Kernel SVM (RBF, Balanced Class Weights, Gradient Descent)¶

In [87]:
params_grid4_reduced = params_grid3_reduced.copy()
In [88]:
feature_map_nystroem_reduced = Nystroem(gamma=.9,
                                random_state=1,
                                n_components=300)
In [89]:
X_train_reduced_rbf = feature_map_nystroem_reduced.fit_transform(X_train_reduced)
X_test_reduced_rbf = feature_map_nystroem_reduced.transform(X_test_reduced)
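
Why does this mapping let a linear SGD classifier behave like an RBF-kernel SVM? Nystroem builds a low-rank feature map whose inner products approximate the exact RBF kernel. A small self-contained check on toy data (for illustration only, not part of our pipeline):

from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 2)  # toy data

exact = rbf_kernel(X_demo, gamma=0.9)                 # exact 200x200 kernel matrix
phi = Nystroem(gamma=0.9, n_components=100,
               random_state=1).fit_transform(X_demo)  # approximate feature map
approx = phi @ phi.T                                  # kernel implied by the map

print(np.abs(exact - approx).max())  # small when n_components is large enough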
In [90]:
model4_reduced = SGDClassifier(**params_grid4_reduced)
model4_reduced = model4_reduced.fit(X_train_reduced_rbf, y_train)
In [91]:
ConfusionMatrixDisplay.from_estimator(model4_reduced, X_test_reduced_rbf, y_test)
plt.title("Kernel SVM (RBF, SGD) Confusion Matrix, Balanced (2 Features)")
plt.show()
In [92]:
print(classification_report(y_test,
                            model4_reduced.predict(X_test_reduced_rbf)))
              precision    recall  f1-score   support

           0       1.00      0.73      0.84     16439
           1       0.28      0.99      0.43      1729

    accuracy                           0.75     18168
   macro avg       0.64      0.86      0.64     18168
weighted avg       0.93      0.75      0.80     18168

4: One Class SVM¶

In [93]:
# fraction of hazardous ("outlier") samples in the training set,
# used below as the nu parameter of the one-class SVM
num_positive = X_train_reduced[y_train == 1].shape[0]
outlier_proportion = num_positive / float(X_train_reduced.shape[0])
In [94]:
gammas = [0.00001, 0.001, 0.01, 0.05, 0.1, 1, 5, 10, 100]
gamma_kernel_reduced = dict()

for g in gammas:
    feature_map = Nystroem(gamma=g,
                           random_state=1,
                           n_components=300)
    # train only on the non-hazardous (inlier) class,
    # but keep both classes in the test data
    X_train_inliers = X_train_reduced[y_train == 0]
    scaler = StandardScaler()
    X_train_scaled_normal = scaler.fit_transform(X_train_inliers)
    X_test_scaled = scaler.transform(X_test_reduced)
    # apply the approximate RBF kernel map
    X_train_rbf = feature_map.fit_transform(X_train_scaled_normal)
    X_test_rbf = feature_map.transform(X_test_scaled)
    gamma_kernel_reduced[g] = (X_train_rbf, X_test_rbf, y_train, y_test)
In [95]:
params4 = {
    'average': True,
    'eta0': 0.001,
    'learning_rate': 'optimal',
    'max_iter': 5000,
    'random_state': 42,
}
In [96]:
models_reduced = dict()

for g, X in gamma_kernel_reduced.items():
    X_train_rbf, _, _, _ = X
    svm = SGDOneClassSVM(nu=outlier_proportion,
                         **params4)
    models_reduced[g] = svm.fit(X_train_rbf)
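
One detail worth flagging before evaluation: SGDOneClassSVM.predict returns +1 for inliers and -1 for outliers, while our labels use 0 for non-hazardous and 1 for hazardous, so the evaluate_one_class_svm helper defined earlier in the notebook has to remap predictions before scoring. A minimal sketch of that idea (the function name here is illustrative, not our actual helper):

def evaluate_one_class_sketch(svm, X_test, y_test, gamma=None):
    # map +1 (inlier) -> 0 (non-hazardous) and -1 (outlier) -> 1 (hazardous)
    y_pred = (svm.predict(X_test) == -1).astype(int)
    print(classification_report(y_test, y_pred))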
    
In [97]:
for g, X in gamma_kernel_reduced.items():
    _, X_test_rbf, _, y_test = X
    svm = models_reduced[g]
    evaluate_one_class_svm(svm, X_test_rbf, y_test, gamma=g)
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     16439
           1       0.17      0.00      0.01      1729

    accuracy                           0.90     18168
   macro avg       0.54      0.50      0.48     18168
weighted avg       0.83      0.90      0.86     18168

              precision    recall  f1-score   support

           0       0.91      0.92      0.91     16439
           1       0.15      0.14      0.14      1729

    accuracy                           0.84     18168
   macro avg       0.53      0.53      0.53     18168
weighted avg       0.84      0.84      0.84     18168

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     16439
           1       0.14      0.15      0.15      1729

    accuracy                           0.83     18168
   macro avg       0.53      0.53      0.53     18168
weighted avg       0.84      0.83      0.83     18168

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     16439
           1       0.15      0.16      0.16      1729

    accuracy                           0.83     18168
   macro avg       0.53      0.53      0.53     18168
weighted avg       0.84      0.83      0.84     18168

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     16439
           1       0.17      0.19      0.17      1729

    accuracy                           0.83     18168
   macro avg       0.54      0.54      0.54     18168
weighted avg       0.84      0.83      0.84     18168

              precision    recall  f1-score   support

           0       0.92      0.92      0.92     16439
           1       0.22      0.21      0.22      1729

    accuracy                           0.85     18168
   macro avg       0.57      0.57      0.57     18168
weighted avg       0.85      0.85      0.85     18168

              precision    recall  f1-score   support

           0       0.92      0.93      0.93     16439
           1       0.24      0.20      0.22      1729

    accuracy                           0.86     18168
   macro avg       0.58      0.57      0.57     18168
weighted avg       0.85      0.86      0.86     18168

              precision    recall  f1-score   support

           0       0.92      0.93      0.92     16439
           1       0.24      0.22      0.23      1729

    accuracy                           0.86     18168
   macro avg       0.58      0.57      0.57     18168
weighted avg       0.85      0.86      0.86     18168

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     16439
           1       0.00      0.00      0.00      1729

    accuracy                           0.90     18168
   macro avg       0.45      0.50      0.48     18168
weighted avg       0.82      0.90      0.86     18168

/usr/local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

Performance optimization using other models¶

Having implemented parametric models (the SVM variants above), we also decided to try non-parametric models such as the Decision Tree and Random Forest, which make no assumptions about the distribution of the data and are fairly robust to outliers and multicollinearity.

1: Decision Tree¶

In [107]:
from sklearn.tree import DecisionTreeClassifier

x = data.drop(columns='hazardous')
y = data.hazardous

# stratify on the label so the 80/20 split preserves the class imbalance
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20,
                                                    stratify=y, random_state=42)

DT = DecisionTreeClassifier()

DT_model = DT.fit(x_train, y_train)
y_pred = DT_model.predict(x_test)

evaluation(y_test, y_pred)
Classification report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94     16400
           1       0.43      0.45      0.44      1768

    accuracy                           0.89     18168
   macro avg       0.69      0.69      0.69     18168
weighted avg       0.89      0.89      0.89     18168

2: Random Forest¶

In [108]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()

RF_model = RF.fit(x_train, y_train)
y_pred = RF_model.predict(x_test)

evaluation(y_test, y_pred)
Classification report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95     16400
           1       0.61      0.38      0.47      1768

    accuracy                           0.92     18168
   macro avg       0.77      0.68      0.71     18168
weighted avg       0.90      0.92      0.91     18168

Conclusion¶

Visualization of PCA Model¶

For this plot we'll use the one-class SVM trained on the two principal components, since it appeared to perform the best among the classifiers trained on principal components at recognizing samples from both classes in the test set.

For our visualization, we will be using the DecisionBoundaryDisplay class.

In [98]:
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.pipeline import make_pipeline
from matplotlib.colors import ListedColormap

Setup code:

In [99]:
gamma = 1
_, X_test, _, y_test = gamma_kernel_reduced[gamma]
svm = models_reduced[gamma]

pipe = make_pipeline(feature_map_nystroem_reduced, svm)

red_green_colormap = ListedColormap(["green", "red"])

Plotting function:

In [100]:
def plot_svm_2D(title, clf, X_test, y_test, gamma):
    '''Plots the SVM decision boundary and test samples in color.'''
    # set up a single set of axes for plotting
    fig, ax = plt.subplots(figsize=(6, 6))
    # plot the samples and decision boundary
    X0, X1 = X_test[:, 0], X_test[:, 1]

    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X_test,
        response_method="predict",
        cmap=red_green_colormap,
        alpha=0.8,
        ax=ax,
        xlabel="PC 1",
        ylabel="PC 2",
        plot_method="contour",
        grid_resolution=100
    )
    scatter = ax.scatter(X0, X1,
                         c=y_test,
                         cmap=red_green_colormap,
                         edgecolors='black',
                         s=20)
    # add a legend mapping colors to class names
    lines, _ = scatter.legend_elements()
    labels = ["Non-hazardous", "Hazardous"]
    plt.legend(lines, labels)

    # final presentation pieces
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

    plt.show()
In [101]:
plot_svm_2D(f"One Class SGD-SVC in 2D, gamma = {gamma}",
            pipe, X_test_reduced, y_test, gamma)

This visualization corroborates our quantitative results:

  • there are far more non-hazardous points than hazardous ones
  • the decision boundary (drawn in red) encircles an area containing a lot of green points - this is in line with our confusion matrix, which indeed showed a high number of false positives
  • the fact that the decision boundary is roughly a circle reflects our use of the RBF kernel, rather than a purely linear one

Note: please forgive us for highlighting the decision boundary in red - we know the instructions said to use black, but the red stands out better against the data points.

Before and After PCA - which model is most effective, and why?¶

Ultimately, in comparing the One Class SVM model trained on 4 features against the one trained on 2 principal components (PCs), we observe that, with gamma = 1, both models achieve an accuracy of 85%. However, the model trained on 4 features scores higher on macro-averaged precision, recall, and f1-score (0.59), while the model trained on PCs achieves a slightly lower 0.57. All of the aforementioned metrics are with respect to the test set. This difference in performance suggests the model trained on 4 features is more effective.

Why is this so? We believe there are two main reasons why PCA fails to improve our model on the Near-Earth Objects (NEOs) dataset:

  1. Non-linearity: PCA is a linear transformation, so it is best suited to datasets where the two classes are (close to) linearly separable. That is not the case for the NEOs dataset: reducing the dimensionality did not, by itself, make it any easier to fit a support vector machine to the data.
  2. Non-orthogonal structure: PCA works by finding the axes onto which to project the samples of our dataset such that we preserve maximal information about the spread of our values. The projection is orthogonal, so the PCs are uncorrelated; however, for our dataset the PCs form one large "clump" in 2D (see the sketch below). This indicates that PCA didn't help much here, since it did not improve the linear separability of our data - and as discussed above, that is bad news for training SVMs.
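
One way to sanity-check point 2 is to look at how much variance the two retained components capture, and to plot the test samples in PC space colored by label. A minimal sketch, assuming a fitted PCA object named pca_reduced (a hypothetical name; ours was fitted earlier in the notebook):

print(pca_reduced.explained_variance_ratio_)  # variance captured by each PC

plt.scatter(X_test_reduced[:, 0], X_test_reduced[:, 1],
            c=y_test, cmap=red_green_colormap, s=5)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('Test Samples in Principal-Component Space')
plt.show()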

Another conclusion from this project concerns our use of the RBF kernel. Rather than finding a clear winner between the RBF and linear kernels, we observe a tradeoff: while the SVM trained with a linear kernel had a slightly higher test f1-score, the RBF kernel increased our model's recall on the "hazardous" class, in exchange for a slight f1-score decrease (< 2%).

Next Steps:¶

Our group would be excited to pursue further work in this problem domain. Here are a few ideas we have already considered:

  1. Further feature engineering or data collection - while we used the RBF kernel to increase the dimensionality of our dataset, in the future it would be interesting to see if we could add features for each NEO that relate to real-world phenomena, e.g. elemental properties of the object. We have a hunch this could reveal even better dimensions along which to separate "hazardous" from "non-hazardous" samples.
  2. Comparing other kinds of models - for most of this project, we focused on support vector machines (and variants thereof, e.g. the one-class SVM). Our preliminary Decision Tree and Random Forest results above suggest that tuning non-parametric classifiers and ensemble techniques further would be worthwhile.